Setup

Seven basic functions:

# House scraping: get_df_suburb
#
# Scrape every "sold townhouse" listing page on auhouseprices.com for one NSW
# suburb and combine the priced sales into a single data frame.
#
# Args:
#   location: URL path fragment of the form "<postcode>/<Suburb>/" (the
#             trailing slash is required because the page number is appended
#             directly). Runs of whitespace are replaced by "+".
#
# Returns:
#   A data frame with columns House_ID, address, bedroom, bathroom, carspace,
#   soldprice and yearsold (one row per sale that lists a price), or NULL if
#   any step fails. On failure the location is appended to
#   main1_export_Brandon_log/main1_export_Brandon_log.txt and the reason is
#   reported with message().
get_df_suburb <- function(location = "2151/Parramatta/") {
  # adapted from https://embracingtherandom.com/r/web-scraping/rent-scraping/
  base_url <- "https://www.auhouseprices.com/sold/list/NSW/"
  # type set to townhouse, no other filtering
  query <- "/?type=townhouse&ymin=0&ymax=0&bmin=0&bmax=0&pmin=0&pmax=0&sort=date&kw="

  tryCatch({
    location <- gsub("\\s+", "+", location)

    # The first <h2> of page 1 carries the listing counts; its second and
    # third numbers are read as "properties per page" and "total properties".
    first_page <- read_html(paste0(base_url, location, 1, query))
    heading <- first_page %>% html_nodes("h2") %>% html_text()
    heading <- heading[1]
    numbers <- as.numeric(regmatches(heading, gregexpr("[0-9]+", heading))[[1]])
    # number of total properties / number on page = total number of pages
    end_page <- ceiling(numbers[3] / numbers[2])
    if (!is.finite(end_page) || end_page < 1) {
      # Guard explicitly: 1:0 would otherwise iterate over pages 1 and 0.
      stop("could not determine the number of result pages", call. = FALSE)
    }

    # Collect one data frame per page and bind once at the end instead of
    # growing a data frame inside the loop.
    page_frames <- vector("list", end_page)

    for (this_page in seq_len(end_page)) {
      # Page 1 was already downloaded to read the page count; reuse it.
      webpage <- if (this_page == 1) {
        first_page
      } else {
        read_html(paste0(base_url, location, this_page, query))
      }

      result <- webpage %>% html_nodes("li") %>% html_text()
      # end of the relevant content
      result <- result[seq_len(grep("current", result)[1])]
      # remove the redundant "listed price"
      result <- result[!grepl("List", result)]
      # remove the price listed with rent
      result <- result[!grepl("Rent", result)]

      # Entries containing "$" hold "<bed> <bath> <car> $<price>".
      price_rows <- grep("\\$", result)
      price_bedroom <- strsplit(result[price_rows], "\\$")
      bedroom <- lapply(price_bedroom, `[`, 1)
      bedroom <- strsplit(unlist(trimws(bedroom)), "\\s+")

      price <- lapply(price_bedroom, `[`, 2)
      price <- trimws(price)
      price <- as.numeric(gsub(",", "", price))

      # The entry just before each price is the "Sold on ..." date. Sales
      # without a listed price are skipped by construction.
      timesold <- result[price_rows - 1]
      timesold <- trimws(gsub("Sold on", "", timesold))

      # The date may be "day month year", "month year" or just "year".
      timesold <- lapply(timesold, function(x) {
        n_parts <- length(strsplit(x, "\\s")[[1]])
        if (n_parts == 3) {
          dmy(x)
        } else if (n_parts == 2) {
          my(x)
        } else {
          as.Date(paste0(x, "-01-01"))
        }
      })
      timesold <- do.call("c", timesold)

      # get address of these properties
      address <- webpage %>% html_nodes("h4") %>% html_text()
      # Keep only the headings before the "Auction History" heading.
      address <- address[seq_len(grep("Auction History", address)[1] - 1)]

      # For every sold entry the immediate next row should be its price; if
      # not, that sale has no price record and its address is dropped.
      sold_info <- grep("Sold on", result)
      contain_price <- sold_info %in% (price_rows - 1)
      address <- address[contain_price]

      page_frames[[this_page]] <- data.frame(
        address = address,
        bedroom = as.numeric(unlist(lapply(bedroom, `[`, 1))),
        bathroom = as.numeric(unlist(lapply(bedroom, `[`, 2))),
        carspace = as.numeric(unlist(lapply(bedroom, `[`, 3))),
        soldprice = price,
        yearsold = timesold
      )
    }

    df <- do.call(rbind, page_frames)
    if (is.null(df) || nrow(df) == 0) {
      stop("no priced sales found", call. = FALSE)
    }

    # Add a running House_ID and move it to the front of the data frame.
    df$House_ID <- seq_len(nrow(df))
    df <- df[, c("House_ID", setdiff(names(df), "House_ID"))]
    df
  }, error = function(e) {
    # Best-effort handling: record the suburb that failed so it can be
    # retried later, report why, and let the caller skip it.
    log_dir <- "main1_export_Brandon_log"
    log_file <- file.path(log_dir, "main1_export_Brandon_log.txt")

    # Create the directory if it doesn't exist
    if (!dir.exists(log_dir)) {
      dir.create(log_dir)
    }

    # Write location to the file
    write(location, log_file, append = TRUE)
    message("get_df_suburb: skipping ", location, " (", conditionMessage(e), ")")
    NULL
  })
}


# Haversine distance between a point and a fixed reference point.
#
# Args:
#   lat, lon:             coordinates of the point.
#   fixed_lat, fixed_lon: coordinates of the reference point.
#
# Returns:
#   Whatever distHaversine() returns for the two points; both are passed in
#   (longitude, latitude) order.
add_distance_between <- function(lat, lon, fixed_lat, fixed_lon) {
  point <- c(lon, lat)
  reference <- c(fixed_lon, fixed_lat)
  distHaversine(point, reference)
}


# Geocode every address of a suburb and attach its distance to a fixed point.
#
# Args:
#   df_suburb:  data frame with an `address` column (as built by
#               get_df_suburb).
#   suburb_lat: latitude of the reference point.
#   suburb_lon: longitude of the reference point.
#   location:   suburb identifier; only used by the commented-out progress
#               messages.
#
# Returns:
#   A data frame holding the geocoded input (adds `latitude` and `longitude`)
#   plus a `distance_to_train_station` column computed by
#   add_distance_between().
get_l_suburb_dist <- function(df_suburb, suburb_lat, suburb_lon, location) {
  geocoded <- geocode(
    df_suburb,
    address,
    method = "arcgis",
    lat = latitude,
    long = longitude
  )
  # print(paste0(location, ": 2/4: get_l_suburb: done!"))

  # Row-wise distance: each row arrives as c(latitude, longitude).
  coordinates <- geocoded[, c("latitude", "longitude")]
  station_distance <- apply(coordinates, 1, function(row) {
    add_distance_between(row[1], row[2], suburb_lat, suburb_lon)
  })
  # print(paste0(location, ": 3/4: get_l_suburb_dist: done!"))

  data.frame(geocoded, distance_to_train_station = station_distance)
}


# Write one suburb's data frame to csv_cache/l_<location>houseprice.csv.
#
# Args:
#   location:      suburb identifier such as "2151/Parramatta/"; every "/" is
#                  replaced by "_" to build the file name.
#   l_suburb_dist: data frame to export.
#
# Returns:
#   The string "Result: csv export finished".
export_l_suburb_dist_csv <- function(location, l_suburb_dist) {
  out_dir <- "csv_cache"
  # Do not rely on a caller having created the cache directory.
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  # e.g. "2151/Parramatta/" -> "l_2151_Parramatta_houseprice.csv"
  file_name <- paste0("l_", gsub("/", "_", location), "houseprice.csv")
  write.csv(l_suburb_dist, file.path(out_dir, file_name), row.names = FALSE)
  # print(paste0(location, ": 4/4: export_l_suburb_dist_csv: done!"))
  "Result: csv export finished"
}


# Run the scrape -> geocode/distance -> CSV export pipeline for one suburb.
#
# Args:
#   location:   suburb identifier passed to get_df_suburb().
#   suburb_lat: latitude of the suburb's reference point.
#   suburb_lon: longitude of the suburb's reference point.
#
# Returns:
#   NULL when get_df_suburb() returned NULL (its failure signal); otherwise
#   the value returned by export_l_suburb_dist_csv().
export_a_suburb <- function(location, suburb_lat, suburb_lon) {
  scraped <- get_df_suburb(location)

  # Nothing to geocode or export when scraping failed.
  if (is.null(scraped)) {
    return(NULL)
  }

  with_distance <- get_l_suburb_dist(scraped, suburb_lat, suburb_lon, location)
  # print(paste0(location, ": Finish csv export"))
  export_l_suburb_dist_csv(location, with_distance)
}


# Delete the failed-suburb log file, if there is one.
#
# Returns:
#   The result of file.remove() when the log existed, otherwise NULL
#   (invisibly).
clear_log <- function() {
  log_file <- file.path(
    "main1_export_Brandon_log",
    "main1_export_Brandon_log.txt"
  )

  # Nothing to clear when the log has not been written yet.
  if (!file.exists(log_file)) {
    return(invisible(NULL))
  }

  file.remove(log_file)
}


# Modified export_all_suburbs function with progress bar
#
# Read a list of suburbs and export one CSV per suburb into csv_cache/.
#
# Args:
#   file_name: path to a headerless, comma-separated text file with three
#              fields per line: location, latitude, longitude. Rows whose
#              location starts with "#" are skipped.
#
# Returns:
#   NULL, invisibly. The function is called for its side effects: it clears
#   the failure log, creates csv_cache/ if needed, calls export_a_suburb()
#   for every suburb (in random order) and prints a progress bar.
export_all_suburbs <- function(file_name) {
  cat("Exporting into csv_cache/ begins:\n")
  # Clear the log
  clear_log()

  # Create the output directory if it doesn't exist. The check must look at
  # the same relative path that is created, otherwise dir.create() warns on
  # every re-run.
  if (!dir.exists("csv_cache")) {
    dir.create("csv_cache")
  }

  # Read the input file
  suburbs_input <- read.table(
    file_name,
    header = FALSE,
    sep = ",",
    col.names = c("location", "latitude", "longitude"),
    strip.white = TRUE,
    comment.char = "",
    quote = ""
  )

  # Filter out rows starting with a '#' character
  suburbs_input <- suburbs_input[!grepl("^#", suburbs_input$location), ]

  # Randomize the order of rows
  suburbs_input <- suburbs_input[sample(nrow(suburbs_input)), ]

  # Loop through each row in the input file and call export_a_suburb.
  # seq_len() keeps the loop from running when no rows are left.
  n_suburbs <- nrow(suburbs_input)
  for (i in seq_len(n_suburbs)) {
    location <- as.character(suburbs_input[i, "location"])
    latitude <- as.numeric(suburbs_input[i, "latitude"])
    longitude <- as.numeric(suburbs_input[i, "longitude"])

    export_a_suburb(location, latitude, longitude)

    # Print progress bar: each '#' represents 2% of the progress and the bar
    # is padded to 50 characters (plus the leading '#' in the format string).
    progress <- i / n_suburbs
    num_hashes <- floor(progress * 100 / 2)
    num_spaces <- 50 - num_hashes
    cat("\n")
    cat(sprintf("#%s%s (%.0f%%)\n", paste(rep("#", num_hashes), collapse = ""), paste(rep(" ", num_spaces), collapse = ""), progress * 100))
  }

  invisible(NULL)
}

Export all

# Export one CSV per suburb listed in main1_INPUT.txt (location, latitude,
# longitude per line); the console output of the run follows.
export_all_suburbs("main1_INPUT.txt")
## Exporting into csv_cache/ begins:
## Warning in dir.create("csv_cache"): 'csv_cache' already exists
## Passing 284 addresses to the ArcGIS single address geocoder
## Query completed in: 155.4 seconds
## 
## #                                                   (1%)
## Passing 170 addresses to the ArcGIS single address geocoder
## Query completed in: 92.3 seconds
## 
## #                                                   (1%)
## 
## #                                                   (2%)
## Passing 15 addresses to the ArcGIS single address geocoder
## Query completed in: 6.6 seconds
## 
## ##                                                  (3%)
## Passing 128 addresses to the ArcGIS single address geocoder
## Query completed in: 69.9 seconds
## 
## ##                                                  (3%)
## 
## ##                                                  (4%)
## 
## ###                                                 (5%)
## Passing 189 addresses to the ArcGIS single address geocoder
## Query completed in: 104.7 seconds
## 
## ###                                                 (5%)
## Passing 102 addresses to the ArcGIS single address geocoder
## Query completed in: 55.8 seconds
## 
## ###                                                 (6%)
## 
## ####                                                (6%)
## Passing 96 addresses to the ArcGIS single address geocoder
## Query completed in: 54.9 seconds
## 
## ####                                                (7%)
## 
## ####                                                (8%)
## Passing 492 addresses to the ArcGIS single address geocoder
## Query completed in: 276.9 seconds
## 
## #####                                               (8%)
## Passing 189 addresses to the ArcGIS single address geocoder
## Query completed in: 112.4 seconds
## 
## #####                                               (9%)
## Passing 124 addresses to the ArcGIS single address geocoder
## Query completed in: 68.4 seconds
## 
## #####                                               (10%)
## Passing 965 addresses to the ArcGIS single address geocoder
## Query completed in: 534.7 seconds
## 
## ######                                              (10%)
## Passing 277 addresses to the ArcGIS single address geocoder
## Query completed in: 140.4 seconds
## 
## ######                                              (11%)
## Passing 78 addresses to the ArcGIS single address geocoder
## Query completed in: 42.2 seconds
## 
## ######                                              (12%)
## Passing 109 addresses to the ArcGIS single address geocoder
## Query completed in: 61.9 seconds
## 
## #######                                             (12%)
## Passing 9 addresses to the ArcGIS single address geocoder
## Query completed in: 5 seconds
## 
## #######                                             (13%)
## Passing 179 addresses to the ArcGIS single address geocoder
## Query completed in: 99.5 seconds
## 
## #######                                             (14%)
## 
## ########                                            (14%)
## 
## ########                                            (15%)
## Passing 125 addresses to the ArcGIS single address geocoder
## Query completed in: 67.5 seconds
## 
## ########                                            (16%)
## Passing 145 addresses to the ArcGIS single address geocoder
## Query completed in: 81.2 seconds
## 
## #########                                           (16%)
## Passing 354 addresses to the ArcGIS single address geocoder
## Query completed in: 189 seconds
## 
## #########                                           (17%)
## Passing 21 addresses to the ArcGIS single address geocoder
## Query completed in: 9.6 seconds
## 
## #########                                           (18%)
## Passing 270 addresses to the ArcGIS single address geocoder
## Query completed in: 148.8 seconds
## 
## ##########                                          (18%)
## Passing 38 addresses to the ArcGIS single address geocoder
## Query completed in: 19.3 seconds
## 
## ##########                                          (19%)
## Passing 138 addresses to the ArcGIS single address geocoder
## Query completed in: 79.6 seconds
## 
## ##########                                          (19%)
## Passing 387 addresses to the ArcGIS single address geocoder
## Query completed in: 214.7 seconds
## 
## ###########                                         (20%)
## Passing 338 addresses to the ArcGIS single address geocoder
## Query completed in: 192.6 seconds
## 
## ###########                                         (21%)
## Passing 105 addresses to the ArcGIS single address geocoder
## Query completed in: 54.2 seconds
## 
## ###########                                         (21%)
## Passing 143 addresses to the ArcGIS single address geocoder
## Query completed in: 76.9 seconds
## 
## ############                                        (22%)
## 
## ############                                        (23%)
## Passing 37 addresses to the ArcGIS single address geocoder
## Query completed in: 19.7 seconds
## 
## ############                                        (23%)
## Passing 63 addresses to the ArcGIS single address geocoder
## Query completed in: 36.2 seconds
## 
## #############                                       (24%)
## Passing 294 addresses to the ArcGIS single address geocoder
## Query completed in: 161.1 seconds
## 
## #############                                       (25%)
## Passing 47 addresses to the ArcGIS single address geocoder
## Query completed in: 24.7 seconds
## 
## #############                                       (25%)
## Passing 97 addresses to the ArcGIS single address geocoder
## Query completed in: 53.1 seconds
## 
## #############                                       (26%)
## Passing 86 addresses to the ArcGIS single address geocoder
## Query completed in: 49.8 seconds
## 
## ##############                                      (27%)
## Passing 716 addresses to the ArcGIS single address geocoder
## Query completed in: 387.4 seconds
## 
## ##############                                      (27%)
## Passing 1,173 addresses to the ArcGIS single address geocoder
## Query completed in: 645.1 seconds
## 
## ##############                                      (28%)
## Passing 124 addresses to the ArcGIS single address geocoder
## Query completed in: 64.5 seconds
## 
## ###############                                     (29%)
## Passing 79 addresses to the ArcGIS single address geocoder
## Query completed in: 40.3 seconds
## 
## ###############                                     (29%)
## Passing 54 addresses to the ArcGIS single address geocoder
## Query completed in: 24.6 seconds
## 
## ###############                                     (30%)
## Passing 52 addresses to the ArcGIS single address geocoder
## Query completed in: 27.9 seconds
## 
## ################                                    (31%)
## Passing 259 addresses to the ArcGIS single address geocoder
## Query completed in: 147.4 seconds
## 
## ################                                    (31%)
## Passing 69 addresses to the ArcGIS single address geocoder
## Query completed in: 39.4 seconds
## 
## ################                                    (32%)
## Passing 450 addresses to the ArcGIS single address geocoder
## Query completed in: 237.4 seconds
## 
## #################                                   (32%)
## Passing 867 addresses to the ArcGIS single address geocoder
## Query completed in: 476.2 seconds
## 
## #################                                   (33%)
## 
## #################                                   (34%)
## Passing 317 addresses to the ArcGIS single address geocoder
## Query completed in: 171.7 seconds
## 
## ##################                                  (34%)
## Passing 99 addresses to the ArcGIS single address geocoder
## Query completed in: 51.6 seconds
## 
## ##################                                  (35%)
## Passing 414 addresses to the ArcGIS single address geocoder
## Query completed in: 226.8 seconds
## 
## ##################                                  (36%)
## Passing 106 addresses to the ArcGIS single address geocoder
## Warning in query_api(api_url, api_query_parameters, method = method): Gateway
## Timeout (HTTP 504).
## Error: <html>
## <head><title>504 Gateway Time-out</title></head>
## <body>
## <center><h1>504 Gateway Time-out</h1></c
## Warning in query_api(api_url, api_query_parameters, method = method): Gateway
## Timeout (HTTP 504).
## Error: <html>
## <head><title>504 Gateway Time-out</title></head>
## <body>
## <center><h1>504 Gateway Time-out</h1></c
## Query completed in: 112.1 seconds
## 
## ###################                                 (36%)
## Passing 97 addresses to the ArcGIS single address geocoder
## Query completed in: 53.1 seconds
## 
## ###################                                 (37%)
## 
## ###################                                 (38%)
## Passing 190 addresses to the ArcGIS single address geocoder
## Query completed in: 102.1 seconds
## 
## ####################                                (38%)
## Passing 13 addresses to the ArcGIS single address geocoder
## Query completed in: 8 seconds
## 
## ####################                                (39%)
## Passing 145 addresses to the ArcGIS single address geocoder
## Query completed in: 83.5 seconds
## 
## ####################                                (40%)
## Passing 42 addresses to the ArcGIS single address geocoder
## Query completed in: 20.7 seconds
## 
## #####################                               (40%)
## 
## #####################                               (41%)
## Passing 543 addresses to the ArcGIS single address geocoder
## Query completed in: 305.8 seconds
## 
## #####################                               (42%)
## Passing 95 addresses to the ArcGIS single address geocoder
## Query completed in: 51.9 seconds
## 
## ######################                              (42%)
## Passing 54 addresses to the ArcGIS single address geocoder
## Query completed in: 26.5 seconds
## 
## ######################                              (43%)
## Passing 49 addresses to the ArcGIS single address geocoder
## Query completed in: 26.4 seconds
## 
## ######################                              (44%)
## Passing 65 addresses to the ArcGIS single address geocoder
## Query completed in: 36.7 seconds
## 
## #######################                             (44%)
## 
## #######################                             (45%)
## Passing 153 addresses to the ArcGIS single address geocoder
## Query completed in: 88.4 seconds
## 
## #######################                             (45%)
## Passing 54 addresses to the ArcGIS single address geocoder
## Query completed in: 26.8 seconds
## 
## ########################                            (46%)
## Passing 367 addresses to the ArcGIS single address geocoder
## Query completed in: 189.8 seconds
## 
## ########################                            (47%)
## Passing 186 addresses to the ArcGIS single address geocoder
## Query completed in: 102.8 seconds
## 
## ########################                            (47%)
## Passing 571 addresses to the ArcGIS single address geocoder
## Query completed in: 316.6 seconds
## 
## #########################                           (48%)
## Passing 21 addresses to the ArcGIS single address geocoder
## Query completed in: 9.4 seconds
## 
## #########################                           (49%)
## 
## #########################                           (49%)
## Passing 508 addresses to the ArcGIS single address geocoder
## Query completed in: 276 seconds
## 
## ##########################                          (50%)
## Passing 161 addresses to the ArcGIS single address geocoder
## Query completed in: 83.5 seconds
## 
## ##########################                          (51%)
## 
## ##########################                          (51%)
## Passing 310 addresses to the ArcGIS single address geocoder
## Query completed in: 1109.2 seconds
## 
## ##########################                          (52%)
## 
## ###########################                         (53%)
## 
## ###########################                         (53%)
## 
## ###########################                         (54%)
## Passing 49 addresses to the ArcGIS single address geocoder
## Query completed in: 29 seconds
## 
## ############################                        (55%)
## Passing 362 addresses to the ArcGIS single address geocoder
## Query completed in: 2419.8 seconds
## 
## ############################                        (55%)
## 
## ############################                        (56%)
## Passing 753 addresses to the ArcGIS single address geocoder
## Query completed in: 408.5 seconds
## 
## #############################                       (56%)
## 
## #############################                       (57%)
## Passing 156 addresses to the ArcGIS single address geocoder
## Query completed in: 82.5 seconds
## 
## #############################                       (58%)
## Passing 516 addresses to the ArcGIS single address geocoder
## Query completed in: 276.1 seconds
## 
## ##############################                      (58%)
## Passing 17 addresses to the ArcGIS single address geocoder
## Query completed in: 8.8 seconds
## 
## ##############################                      (59%)
## Passing 77 addresses to the ArcGIS single address geocoder
## Query completed in: 45.6 seconds
## 
## ##############################                      (60%)
## Passing 26 addresses to the ArcGIS single address geocoder
## Query completed in: 14.5 seconds
## 
## ###############################                     (60%)
## Passing 120 addresses to the ArcGIS single address geocoder
## Query completed in: 66.5 seconds
## 
## ###############################                     (61%)
## 
## ###############################                     (62%)
## Passing 65 addresses to the ArcGIS single address geocoder
## Query completed in: 36.9 seconds
## 
## ################################                    (62%)
## Passing 169 addresses to the ArcGIS single address geocoder
## Query completed in: 90.2 seconds
## 
## ################################                    (63%)
## 
## ################################                    (64%)
## Passing 17 addresses to the ArcGIS single address geocoder
## Query completed in: 8.6 seconds
## 
## #################################                   (64%)
## Passing 74 addresses to the ArcGIS single address geocoder
## Query completed in: 44.2 seconds
## 
## #################################                   (65%)
## Passing 150 addresses to the ArcGIS single address geocoder
## Query completed in: 79.3 seconds
## 
## #################################                   (66%)
## Passing 107 addresses to the ArcGIS single address geocoder
## Query completed in: 58 seconds
## 
## ##################################                  (66%)
## Passing 170 addresses to the ArcGIS single address geocoder
## Query completed in: 97.6 seconds
## 
## ##################################                  (67%)
## Passing 257 addresses to the ArcGIS single address geocoder
## Query completed in: 142.7 seconds
## 
## ##################################                  (68%)
## Passing 170 addresses to the ArcGIS single address geocoder
## Query completed in: 91.3 seconds
## 
## ###################################                 (68%)
## Passing 12 addresses to the ArcGIS single address geocoder
## Query completed in: 6 seconds
## 
## ###################################                 (69%)
## 
## ###################################                 (69%)
## Passing 110 addresses to the ArcGIS single address geocoder
## Query completed in: 62 seconds
## 
## ####################################                (70%)
## Passing 272 addresses to the ArcGIS single address geocoder
## Query completed in: 150.2 seconds
## 
## ####################################                (71%)
## 
## ####################################                (71%)
## Passing 280 addresses to the ArcGIS single address geocoder
## Query completed in: 146.3 seconds
## 
## #####################################               (72%)
## 
## #####################################               (73%)
## Passing 404 addresses to the ArcGIS single address geocoder
## Query completed in: 226.2 seconds
## 
## #####################################               (73%)
## Passing 66 addresses to the ArcGIS single address geocoder
## Query completed in: 32.6 seconds
## 
## ######################################              (74%)
## Passing 105 addresses to the ArcGIS single address geocoder
## Query completed in: 54 seconds
## 
## ######################################              (75%)
## 
## ######################################              (75%)
## Passing 38 addresses to the ArcGIS single address geocoder
## Query completed in: 18.8 seconds
## 
## ######################################              (76%)
## Passing 183 addresses to the ArcGIS single address geocoder
## Query completed in: 102.4 seconds
## 
## #######################################             (77%)
## Passing 31 addresses to the ArcGIS single address geocoder
## Query completed in: 17.3 seconds
## 
## #######################################             (77%)
## Passing 324 addresses to the ArcGIS single address geocoder
## Query completed in: 178.8 seconds
## 
## #######################################             (78%)
## Passing 20 addresses to the ArcGIS single address geocoder
## Query completed in: 11.8 seconds
## 
## ########################################            (79%)
## 
## ########################################            (79%)
## 
## ########################################            (80%)
## 
## #########################################           (81%)
## 
## #########################################           (81%)
## Passing 402 addresses to the ArcGIS single address geocoder
## Query completed in: 224.3 seconds
## 
## #########################################           (82%)
## Passing 84 addresses to the ArcGIS single address geocoder
## Query completed in: 46.8 seconds
## 
## ##########################################          (82%)
## Passing 159 addresses to the ArcGIS single address geocoder
## Query completed in: 92 seconds
## 
## ##########################################          (83%)
## Passing 65 addresses to the ArcGIS single address geocoder
## Query completed in: 32.1 seconds
## 
## ##########################################          (84%)
## Passing 410 addresses to the ArcGIS single address geocoder
## Query completed in: 222.8 seconds
## 
## ###########################################         (84%)
## Passing 431 addresses to the ArcGIS single address geocoder
## Query completed in: 237.5 seconds
## 
## ###########################################         (85%)
## 
## ###########################################         (86%)
## 
## ############################################        (86%)
## Passing 407 addresses to the ArcGIS single address geocoder
## Query completed in: 226 seconds
## 
## ############################################        (87%)
## Passing 35 addresses to the ArcGIS single address geocoder
## Query completed in: 18.9 seconds
## 
## ############################################        (88%)
## Passing 64 addresses to the ArcGIS single address geocoder
## Query completed in: 32.7 seconds
## 
## #############################################       (88%)
## Passing 103 addresses to the ArcGIS single address geocoder
## Query completed in: 55.5 seconds
## 
## #############################################       (89%)
## Passing 106 addresses to the ArcGIS single address geocoder
## Query completed in: 54.3 seconds
## 
## #############################################       (90%)
## Passing 11 addresses to the ArcGIS single address geocoder
## Query completed in: 6.1 seconds
## 
## ##############################################      (90%)
## Passing 91 addresses to the ArcGIS single address geocoder
## Query completed in: 50.4 seconds
## 
## ##############################################      (91%)
## Passing 42 addresses to the ArcGIS single address geocoder
## Query completed in: 24.5 seconds
## 
## ##############################################      (92%)
## Passing 88 addresses to the ArcGIS single address geocoder
## Query completed in: 47.2 seconds
## 
## ###############################################     (92%)
## Passing 456 addresses to the ArcGIS single address geocoder
## Query completed in: 259 seconds
## 
## ###############################################     (93%)
## 
## ###############################################     (94%)
## Passing 269 addresses to the ArcGIS single address geocoder
## Query completed in: 150.2 seconds
## 
## ################################################    (94%)
## Passing 25 addresses to the ArcGIS single address geocoder
## Query completed in: 13.1 seconds
## 
## ################################################    (95%)
## Passing 519 addresses to the ArcGIS single address geocoder
## Query completed in: 288.6 seconds
## 
## ################################################    (95%)
## 
## #################################################   (96%)
## 
## #################################################   (97%)
## 
## #################################################   (97%)
## Passing 47 addresses to the ArcGIS single address geocoder
## Query completed in: 27.2 seconds
## 
## ##################################################  (98%)
## Passing 33 addresses to the ArcGIS single address geocoder
## Query completed in: 17.3 seconds
## 
## ##################################################  (99%)
## Passing 34 addresses to the ArcGIS single address geocoder
## Query completed in: 19 seconds
## 
## ##################################################  (99%)
## Passing 277 addresses to the ArcGIS single address geocoder
## Query completed in: 156.8 seconds
## 
## ################################################### (100%)
## NULL

Reading in files

# Get the list of CSV files in the 'main2_INPUT' directory.
# `pattern` is a regular expression (not a glob), so match the extension
# explicitly at the end of the file name.
csv_files <- list.files(path = "main2_INPUT", pattern = "\\.csv$", full.names = TRUE)

# Read one exported suburb CSV and add the derived distance columns:
#   distance_to_train_station(km): distance_to_train_station / 1000
#   distance_class:                that distance cut into intervals
read_location_csv <- function(path) {
  location_data <- read.csv(path)

  # Convert distance to kilometres
  location_data[["distance_to_train_station(km)"]] <-
    location_data[["distance_to_train_station"]] / 1000

  # Classing distance. Values of exactly 0 or above 4 fall outside the
  # breaks and become NA.
  # NOTE(review): the breaks step by 0.25 except for the jump from 2.5 to 3
  # (no 2.75), which makes (2.5,3] twice as wide as the others - confirm
  # whether that is intended.
  location_data$distance_class <- cut(
    location_data[["distance_to_train_station(km)"]],
    breaks = c(0, 0.250, 0.500, 0.750, 1.000, 1.250, 1.500, 1.750, 2.000, 2.250, 2.500, 3.000, 3.250, 3.500, 3.750, 4.000)
  )

  location_data
}

# Read every file, then bind once instead of growing a data frame in a loop.
location_frames <- lapply(csv_files, read_location_csv)
combined_df <- if (length(location_frames) > 0) {
  do.call(rbind, location_frames)
} else {
  data.frame()
}

# Inspect the combined data frame
head(combined_df)
##   House_ID                                address bedroom bathroom carspace
## 1        1  10/92 Buckland Street Alexandria 2015       3        2        2
## 2        2   6/92 Buckland Street Alexandria 2015       3        2        2
## 3        3 14/18-20 Newton Street Alexandria 2015       2        1        1
## 4        4   PG09/11 Power Avenue Alexandria 2015       2        2        1
## 5        5      28A Gerard Street Alexandria 2015       3        3       NA
## 6        6   1/92 Buckland Street Alexandria 2015       3        2        2
##   soldprice   yearsold  latitude longitude distance_to_train_station
## 1   2350000 2023-03-15 -33.90043  151.1947                  967.6896
## 2   2350000 2023-01-30 -33.90043  151.1947                  967.6896
## 3   1140000 2022-07-30 -33.89918  151.1909                 1325.9023
## 4   1550000 2022-07-06 -33.90129  151.1983                  669.3040
## 5   1230000 2022-06-18 -33.89771  151.1969                 1077.5788
## 6   2425000 2022-05-06 -33.90043  151.1947                  967.6896
##   distance_to_train_station(km) distance_class
## 1                     0.9676896       (0.75,1]
## 2                     0.9676896       (0.75,1]
## 3                     1.3259023     (1.25,1.5]
## 4                     0.6693040     (0.5,0.75]
## 5                     1.0775788       (1,1.25]
## 6                     0.9676896       (0.75,1]

Filtering Data

# One subset of the sales per bedroom count (1 to 5 bedrooms).
combined_df_1bed <- filter(combined_df, bedroom == 1)
combined_df_2bed <- filter(combined_df, bedroom == 2)
combined_df_3bed <- filter(combined_df, bedroom == 3)
combined_df_4bed <- filter(combined_df, bedroom == 4)
combined_df_5bed <- filter(combined_df, bedroom == 5)

# Box plot of sold price (in units of $100,000) for each distance class.
#
#   data             : data frame with `distance_class` and `soldprice` columns
#                      (plus `carspace` when `fill_by_carspace` is TRUE).
#   bedroom_label    : text appended to the plot title, e.g. "2 Bedrooms".
#   fill_by_carspace : if TRUE, split every box by the number of car spaces.
#
# Returns the ggplot object; it is drawn when printed (top-level calls
# auto-print). ggplot2 draws with grid, so base-graphics par(mfrow = ...) does
# not tile these plots; each call produces its own figure.
plot_price_by_distance <- function(data, bedroom_label, fill_by_carspace = FALSE) {
  box_mapping <- if (fill_by_carspace) aes(fill = factor(carspace)) else NULL

  ggplot(data, aes(x = distance_class, y = soldprice / 100000)) +
    geom_boxplot(mapping = box_mapping, outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste("Sold Price vs Distance from Train Station for", bedroom_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)",
      fill = "Number of Carspaces"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme(plot.title = element_text(hjust = 0.25))
}

plot_price_by_distance(combined_df_1bed, "1 Bedroom")

plot_price_by_distance(combined_df_1bed, "1 Bedroom", fill_by_carspace = TRUE)

plot_price_by_distance(combined_df_2bed, "2 Bedrooms")

summary(combined_df_2bed$soldprice)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 5.000e+04 3.450e+05 4.600e+05 7.725e+05 6.001e+05 2.147e+09
plot_price_by_distance(combined_df_2bed, "2 Bedrooms", fill_by_carspace = TRUE)

summary(combined_df_2bed$soldprice)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 5.000e+04 3.450e+05 4.600e+05 7.725e+05 6.001e+05 2.147e+09
plot_price_by_distance(combined_df_3bed, "3 Bedrooms")

summary(combined_df_3bed$soldprice)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      575   422000   570000   640598   745000 22867454
plot_price_by_distance(combined_df_3bed, "3 Bedrooms", fill_by_carspace = TRUE)

summary(combined_df_3bed$soldprice)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      575   422000   570000   640598   745000 22867454
plot_price_by_distance(combined_df_4bed, "4 Bedrooms")

summary(combined_df_4bed$soldprice)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1650   531000   671000   760535   865000 15000000
plot_price_by_distance(combined_df_4bed, "4 Bedrooms", fill_by_carspace = TRUE)

summary(combined_df_4bed$soldprice)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1650   531000   671000   760535   865000 15000000
plot_price_by_distance(combined_df_5bed, "5 Bedrooms")

summary(combined_df_5bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  200000  656495  780000  857654  927500 3080000
plot_price_by_distance(combined_df_5bed, "5 Bedrooms", fill_by_carspace = TRUE)

summary(combined_df_5bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  200000  656495  780000  857654  927500 3080000

Filtering Data by Carspaces and Bedrooms

# Subsets of the sales for each bedroom / car-space combination; a row is kept
# only when both conditions hold.

# 1 bedroom
combined_df_1bed_1car <- combined_df %>% filter(bedroom == 1, carspace == 1)

# 2 bedrooms
combined_df_2bed_1car <- combined_df %>% filter(bedroom == 2, carspace == 1)
combined_df_2bed_2car <- combined_df %>% filter(bedroom == 2, carspace == 2)

# 3 bedrooms
combined_df_3bed_1car <- combined_df %>% filter(bedroom == 3, carspace == 1)
combined_df_3bed_2car <- combined_df %>% filter(bedroom == 3, carspace == 2)
combined_df_3bed_3car <- combined_df %>% filter(bedroom == 3, carspace == 3)
combined_df_3bed_4car <- combined_df %>% filter(bedroom == 3, carspace == 4)

# 4 bedrooms
combined_df_4bed_1car <- combined_df %>% filter(bedroom == 4, carspace == 1)
combined_df_4bed_2car <- combined_df %>% filter(bedroom == 4, carspace == 2)
combined_df_4bed_3car <- combined_df %>% filter(bedroom == 4, carspace == 3)
combined_df_4bed_4car <- combined_df %>% filter(bedroom == 4, carspace == 4)

# 5 bedrooms
combined_df_5bed_1car <- combined_df %>% filter(bedroom == 5, carspace == 1)
combined_df_5bed_2car <- combined_df %>% filter(bedroom == 5, carspace == 2)
combined_df_5bed_3car <- combined_df %>% filter(bedroom == 5, carspace == 3)

1 bedroom

# Sold price (in units of $100,000) per distance class: 1 bedroom, 1 car space
ggplot(data = combined_df_1bed_1car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 1 Bedroom and 1 Carspace") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_1bed_1car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   61325  252750  375000  401069  484450 1330000

2 bedrooms

# Sold price (in units of $100,000) per distance class: 2 bedrooms, 1 car space
ggplot(data = combined_df_2bed_1car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 2 Bedrooms and 1 Carspace") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_2bed_1car[["soldprice"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 5.700e+04 3.350e+05 4.530e+05 8.359e+05 5.900e+05 2.147e+09
# Same plot for 2 bedrooms, 2 car spaces
ggplot(data = combined_df_2bed_2car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 2 Bedrooms and 2 Carspaces") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_2bed_2car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   66000  430000  545000  586774  700000 1581000

3 bedrooms

# Sold price (in units of $100,000) per distance class for 3-bedroom sales,
# one plot and one price summary per car-space count (1 to 4).
ggplot(combined_df_3bed_1car, aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 1 Carspace",
       x = "Distance from Train Station(km)", y = "Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3bed_1car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50000  356000  497000  546952  650000 5850000
ggplot(combined_df_3bed_2car, aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 2 Carspaces",
       x = "Distance from Train Station(km)", y = "Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3bed_2car$soldprice)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      575   499000   635100   723215   839750 22867454
ggplot(combined_df_3bed_3car, aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 3 Carspaces",
       x = "Distance from Train Station(km)", y = "Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Summarise the same subset that was just plotted (3 bedrooms, 3 car spaces).
summary(combined_df_3bed_3car$soldprice)
## (output omitted: the values previously shown here were those of
##  combined_df_3bed_2car; re-knit the document to regenerate them)
ggplot(combined_df_3bed_4car, aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 4 Carspaces",
       x = "Distance from Train Station(km)", y = "Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3bed_4car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  305000  478500  590000  668278  809000 1600000

4 bedrooms

# Sold price (in units of $100,000) per distance class: 4 bedrooms, 1 car space
ggplot(data = combined_df_4bed_1car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 4 Bedrooms and 1 Carspace") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_4bed_1car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  175000  444000  615000  655876  760000 2750000
# Same plot for 4 bedrooms, 2 car spaces
ggplot(data = combined_df_4bed_2car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 4 Bedrooms and 2 Carspaces") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_4bed_2car[["soldprice"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1650   555000   690000   789160   888250 15000000
# Same plot for 4 bedrooms, 3 car spaces
ggplot(data = combined_df_4bed_3car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 4 Bedrooms and 3 Carspaces") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_4bed_3car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  100000  615250  769000  927646 1023250 3000000
# Same plot for 4 bedrooms, 4 car spaces
ggplot(data = combined_df_4bed_4car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 4 Bedrooms and 4 Carspaces") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_4bed_4car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  330000  535500  655000  715113  840000 1630000

5 bedrooms

# Sold price (in units of $100,000) per distance class: 5 bedrooms, 1 car space
ggplot(data = combined_df_5bed_1car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 5 Bedrooms and 1 Carspace") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_5bed_1car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  280000  487750  675000  716367  853000 1850000
# Same plot for 5 bedrooms, 2 car spaces
ggplot(data = combined_df_5bed_2car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 5 Bedrooms and 2 Carspaces") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_5bed_2car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  200000  665000  810000  898386  958888 3080000
# Same plot for 5 bedrooms, 3 car spaces
ggplot(data = combined_df_5bed_3car, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station for 5 Bedrooms and 3 Carspaces") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

summary(combined_df_5bed_3car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  700000  738000  790000  787833  838250  910000

Creating a column for Year

# Year of sale as a factor, extracted from the `yearsold` date column
combined_df$Year <- as.factor(format(as.Date(combined_df$yearsold), "%Y"))

# Filtering by distance class: one subset per 0.25 km band. The object suffix
# is the lower bound of the band in km.
# NOTE: "(2.5,2.75]" and "(2.75,3]" only match rows when the breaks given to
# cut() for `distance_class` include 2.75; otherwise these subsets are empty.
combined_df_0.00 <- filter(combined_df, distance_class == "(0,0.25]")
combined_df_0.25 <- filter(combined_df, distance_class == "(0.25,0.5]")
combined_df_0.50 <- filter(combined_df, distance_class == "(0.5,0.75]")
combined_df_0.75 <- filter(combined_df, distance_class == "(0.75,1]")
combined_df_1.00 <- filter(combined_df, distance_class == "(1,1.25]")
combined_df_1.25 <- filter(combined_df, distance_class == "(1.25,1.5]")
combined_df_1.50 <- filter(combined_df, distance_class == "(1.5,1.75]")
combined_df_1.75 <- filter(combined_df, distance_class == "(1.75,2]")
combined_df_2.00 <- filter(combined_df, distance_class == "(2,2.25]")
combined_df_2.25 <- filter(combined_df, distance_class == "(2.25,2.5]")
combined_df_2.50 <- filter(combined_df, distance_class == "(2.5,2.75]")
combined_df_2.75 <- filter(combined_df, distance_class == "(2.75,3]")
combined_df_3.00 <- filter(combined_df, distance_class == "(3,3.25]")
combined_df_3.25 <- filter(combined_df, distance_class == "(3.25,3.5]")
combined_df_3.50 <- filter(combined_df, distance_class == "(3.5,3.75]")
combined_df_3.75 <- filter(combined_df, distance_class == "(3.75,4]")

# Box plot of sold price (in units of $100,000) per year of sale.
#
#   data           : data frame with `Year` and `soldprice` columns.
#   distance_range : text describing the distance band, inserted into the
#                    title, e.g. "0.25 to 0.50km".
#
# Returns the ggplot object; it is drawn when printed (top-level calls
# auto-print).
plot_price_by_year <- function(data, distance_range) {
  ggplot(data, aes(x = Year, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste0("Sold Price vs year for townhouses ", distance_range, " from train station"),
      x = "Year",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

plot_price_by_year(combined_df_0.00, "0 to 0.25km")

summary(combined_df_0.00$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50000  415000  560000  655293  780000 3300000
plot_price_by_year(combined_df_0.25, "0.25 to 0.50km")

summary(combined_df_0.25$soldprice)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 5.000e+04 4.450e+05 5.900e+05 1.094e+06 7.850e+05 2.147e+09
plot_price_by_year(combined_df_0.50, "0.50 to 0.75km")

summary(combined_df_0.50$soldprice)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1650   420000   580000   647966   770000 15000000
plot_price_by_year(combined_df_0.75, "0.75 to 1.00km")

summary(combined_df_0.75$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   60000  418375  545000  609335  712000 6203000
plot_price_by_year(combined_df_1.00, "1.00 to 1.25km")

summary(combined_df_1.00$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     575  390000  535000  603137  710000 4400000
plot_price_by_year(combined_df_1.25, "1.25 to 1.50km")

summary(combined_df_1.25$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   61325  377000  525000  570984  685000 4840000
plot_price_by_year(combined_df_1.50, "1.50 to 1.75km")

summary(combined_df_1.50$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   92000  365500  510000  549864  640000 2812000
plot_price_by_year(combined_df_1.75, "1.75 to 2.00km")

summary(combined_df_1.75$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  100000  355000  505000  564044  680000 3100000
plot_price_by_year(combined_df_2.00, "2.00 to 2.25km")

summary(combined_df_2.00$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  132500  370000  491000  528737  620000 2430000
plot_price_by_year(combined_df_2.25, "2.25 to 2.50km")

summary(combined_df_2.25$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  190000  382500  550000  584580  670000 5346000
plot_price_by_year(combined_df_2.50, "2.50 to 2.75km")

summary(combined_df_2.50$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 
plot_price_by_year(combined_df_2.75, "2.75 to 3.00km")

summary(combined_df_2.75$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 
plot_price_by_year(combined_df_3.00, "3.00 to 3.25km")

summary(combined_df_3.00$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  180000  315000  462500  496268  569750 1777000
# This band is "(3.25,3.5]", so the title reads 3.25 to 3.50km
plot_price_by_year(combined_df_3.25, "3.25 to 3.50km")

summary(combined_df_3.25$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  230000  322316  487500  464311  552750 1125000
plot_price_by_year(combined_df_3.50, "3.50 to 3.75km")

summary(combined_df_3.50$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  250000  300000  355000  386685  441000  664000
plot_price_by_year(combined_df_3.75, "3.75 to 4.00km")

summary(combined_df_3.75$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 
# Every sale as a point: price (in units of $100,000) by year of sale,
# coloured by distance class
ggplot(data = combined_df, mapping = aes(Year, soldprice / 100000)) +
  geom_point(mapping = aes(colour = distance_class)) +
  ggtitle("Sold Price over Years") +
  xlab("Year") +
  ylab("Selling Price (x$100000)") +
  labs(fill = "Number of Carspaces") +
  theme_bw() +
  theme(
    axis.text.x = element_text(hjust = 1, angle = 45),
    plot.title = element_text(hjust = 0.25)
  )

# Price distribution per year of sale
ggplot(data = combined_df, mapping = aes(Year, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price over Years") +
  xlab("Year") +
  ylab("Selling Price (x$100000)") +
  labs(fill = "Number of Carspaces") +
  theme_bw() +
  theme(
    axis.text.x = element_text(hjust = 1, angle = 45),
    plot.title = element_text(hjust = 0.25)
  )

# Price distribution per number of bedrooms
ggplot(data = combined_df, mapping = aes(factor(bedroom), soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price for Different Numbers of Bedrooms") +
  xlab("Number of Bedrooms") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(
    axis.text.x = element_text(hjust = 1, angle = 45),
    plot.title = element_text(hjust = 0.25)
  )

# Price distribution per number of bathrooms
ggplot(data = combined_df, mapping = aes(factor(bathroom), soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price for Different Numbers of Bathrooms") +
  xlab("Number of Bathrooms") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(
    axis.text.x = element_text(hjust = 1, angle = 45),
    plot.title = element_text(hjust = 0.25)
  )

# Price distribution per number of car spaces
ggplot(data = combined_df, mapping = aes(factor(carspace), soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price for Different Numbers of Carspaces") +
  xlab("Number of Carspaces") +
  ylab("Selling Price (x$100000)") +
  theme_bw() +
  theme(
    axis.text.x = element_text(hjust = 1, angle = 45),
    plot.title = element_text(hjust = 0.25)
  )

Two added graphs from Jasmine Mon Apr 17, 2023 7 pm

# Sales whose price lies within 1.5 * IQR of the price quartiles.
# NOTE(review): `combine` is not used by the plot or the model in this section;
# both work on `combined`, which is trimmed on distance only. Confirm intended.
q1 <- quantile(combined_df$soldprice, probs = 0.25)
q3 <- quantile(combined_df$soldprice, probs = 0.75)
iqr <- q3 - q1
combine <- subset(
  combined_df,
  subset = (soldprice <= q3 + 1.5 * iqr) & (soldprice >= q1 - 1.5 * iqr)
)

# Same 1.5 * IQR rule applied to the distance (km) column, starting again from
# the full `combined_df`.
Q1 <- quantile(combined_df[["distance_to_train_station(km)"]], probs = 0.25)
Q3 <- quantile(combined_df[["distance_to_train_station(km)"]], probs = 0.75)
IQR <- Q3 - Q1
combined <- subset(
  combined_df,
  subset = (`distance_to_train_station(km)` <= Q3 + 1.5 * IQR) &
    (`distance_to_train_station(km)` >= Q1 - 1.5 * IQR)
)

# Sold price (in units of $100,000) per distance class on the distance-trimmed data
ggplot(data = combined, mapping = aes(distance_class, soldprice / 100000)) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  ggtitle("Sold Price vs Distance from Train Station") +
  xlab("Distance from Train Station(km)") +
  ylab("Selling Price (x$100000)") +
  labs(fill = "Number of Carspaces") +
  theme_bw() +
  theme(
    axis.text.x = element_text(hjust = 1, angle = 45),
    plot.title = element_text(hjust = 0.25)
  )

# Linear regression of sold price on distance, then its residuals against
# distance with a horizontal reference line at zero
model <- lm(soldprice ~ `distance_to_train_station(km)`, data = combined)
plot(
  x = combined[["distance_to_train_station(km)"]],
  y = residuals(model),
  main = "Residual Plot",
  xlab = "Distance to train station (km)",
  ylab = "Residuals",
  cex = 0.15
)
abline(h = 0)